In [4]:
import os, sys
import numpy as np
import pandas as pd
import pandas_profiling
import codecs

import statsmodels.api as sm
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.inspection import permutation_importance
import multiprocessing

from sklearn.preprocessing import StandardScaler

from sklearn import metrics
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV   #####Revisar esta librería.
from sklearn.neural_network import MLPClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

from IPython.display import Image  
from sklearn.tree import export_graphviz
from joblib import dump, load
In [5]:
# Raw inputs: labelled articles (train/test) plus precomputed Google-search
# hit counts and Spanish spell-checker error counts for both splits.
dataTrain = pd.read_excel("train.xlsx")
dataTest = pd.read_excel("test.xlsx")
googleResultsTrain = pd.read_csv("resultados_google_train.csv")
googleResultsTest = pd.read_csv("resultados_google_test.csv")
spanishCorrector_Train = pd.read_csv("SpanishCorrector_Train.csv")
spanishCorrector_Test = pd.read_csv("SpanishCorrector_Test.csv")


def _read_word_list(path):
    """Read a UTF-8 lexicon file and split it on CRLF line endings.

    codecs.open does not translate newlines, so the '\r\n' split matches the
    files' Windows-style line endings.
    """
    with codecs.open(path, 'r', encoding='utf8') as f:
        return f.read().split('\r\n')


# Sentiment lexicons used by the positive/negative word counters below.
positive_words = _read_word_list('positive_words_es.txt')
negative_words = _read_word_list('negative_words_es.txt')
In [6]:
# First rows of the raw training set (derived feature columns still NaN here).
dataTrain.head()
Out[6]:
Id Category Topic Source Headline Text Link %MayusculasHeadLine #ResultadosGoogle #ResultadosGoogleNews ... Palabras unicas/palabras totales #Mistakes #Numeros %Numeros #Comillas %Comillas #SignosInterrogación %SignosInterrogación %SignosExclamación #SignosExclamación
0 1 Fake Education El Ruinaversal RAE INCLUIRÁ LA PALABRA "LADY" EN EL DICCIONAR... RAE INCLUIRÁ LA PALABRA "LADY" EN EL DICCIONAR... http://www.elruinaversal.com/2017/06/10/rae-in... NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 2 Fake Education Hay noticia La palabra "haiga", aceptada por la RAE La palabra "haiga", aceptada por la RAE La Rea... https://haynoticia.es/la-palabra-haiga-aceptad... NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 3 Fake Education El Ruinaversal YORDI ROSADO ESCRIBIRÁ Y DISEÑARÁ LOS NUEVOS L... YORDI ROSADO ESCRIBIRÁ Y DISEÑARÁ LOS NUEVOS L... http://www.elruinaversal.com/2018/05/06/yordi-... NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 4 True Education EL UNIVERSAL UNAM capacitará a maestros para aprobar prueba... UNAM capacitará a maestros para aprobar prueba... http://www.eluniversal.com.mx/articulo/nacion/... NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 5 Fake Education Lamula pretenden aprobar libros escolares con conteni... Alerta: pretenden aprobar libros escolares con... https://redaccion.lamula.pe/2018/06/19/memoria... NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 25 columns

In [7]:
# % of uppercase characters in a string
def mayusculas(str):
    """Return the fraction of characters of ``str`` that are uppercase.

    Returns 0.0 for an empty string (the original raised ZeroDivisionError).
    Parameter name shadows the builtin; kept for backward compatibility.
    """
    if not str:
        return 0.0
    return sum(1 for c in str if c.isupper()) / len(str)
In [8]:
# total number of question marks
def numInterrogacionTot(str):
    """Count the question-mark characters ("¿" and "?") in ``str``."""
    # str.count per character replaces the original manual index loop
    return str.count("¿") + str.count("?")

# fraction of question-mark characters
def numInterrogacionRel(str):
    """Fraction of characters of ``str`` that are question marks ("¿"/"?").

    Returns 0.0 for an empty string (the original raised ZeroDivisionError).
    """
    if not str:
        return 0.0
    return (str.count("¿") + str.count("?")) / len(str)
In [9]:
# total number of exclamation marks
def numExclamacionTot(str):
    """Count the exclamation-mark characters ("¡" and "!") in ``str``."""
    return str.count("¡") + str.count("!")

# fraction of exclamation-mark characters
def numExclamacionRel(str):
    """Fraction of characters of ``str`` that are exclamation marks ("¡"/"!").

    Returns 0.0 for an empty string (the original raised ZeroDivisionError).
    """
    if not str:
        return 0.0
    return (str.count("¡") + str.count("!")) / len(str)
In [10]:
# count of positive-lexicon words
def positiveTot(str, words=None):
    """Count whitespace-separated words of ``str`` found in the positive lexicon.

    ``words`` defaults to the module-level ``positive_words`` list; passing an
    explicit iterable overrides it (backward-compatible extension).
    """
    if words is None:
        words = positive_words
    vocab = set(words)  # O(1) membership instead of scanning the list per word
    return sum(1 for w in str.split() if w in vocab)

def positiveRel(str, words=None):
    """Fraction of whitespace-separated words of ``str`` in the positive lexicon.

    ``words`` defaults to the module-level ``positive_words`` list.
    Returns 0.0 for an empty/whitespace-only string (the original raised
    ZeroDivisionError).
    """
    if words is None:
        words = positive_words
    tokens = str.split()
    if not tokens:
        return 0.0
    vocab = set(words)
    return sum(1 for w in tokens if w in vocab) / len(tokens)
In [11]:
# count of negative-lexicon words
def negativeTot(str, words=None):
    """Count whitespace-separated words of ``str`` found in the negative lexicon.

    ``words`` defaults to the module-level ``negative_words`` list; passing an
    explicit iterable overrides it (backward-compatible extension).
    """
    if words is None:
        words = negative_words
    vocab = set(words)  # O(1) membership instead of scanning the list per word
    return sum(1 for w in str.split() if w in vocab)

def negativeRel(str, words=None):
    """Fraction of whitespace-separated words of ``str`` in the negative lexicon.

    ``words`` defaults to the module-level ``negative_words`` list.
    Returns 0.0 for an empty/whitespace-only string (the original raised
    ZeroDivisionError).
    """
    if words is None:
        words = negative_words
    tokens = str.split()
    if not tokens:
        return 0.0
    vocab = set(words)
    return sum(1 for w in tokens if w in vocab) / len(tokens)
In [12]:
# unique words / total words
def redundancia(str):
    """Type/token ratio: unique words divided by total words of ``str``.

    Uses a set instead of the original O(n^2) list-membership dedup.
    Returns 0.0 for an empty string (the original raised ZeroDivisionError).
    """
    tokens = str.split()
    if not tokens:
        return 0.0
    return len(set(tokens)) / len(tokens)
In [13]:
# Count *NUMBER* placeholder tokens
def num(str):
    """Count occurrences of the "*NUMBER*" placeholder token in ``str``.

    The original deduplicated all tokens and then counted "*NUMBER*" in the
    token list — equivalent to a single list.count call.
    """
    return str.split().count("*NUMBER*")

def numRel(str):
    """Fraction of tokens of ``str`` that are the "*NUMBER*" placeholder.

    Returns 0.0 for an empty string (the original raised ZeroDivisionError).
    """
    tokens = str.split()
    if not tokens:
        return 0.0
    return tokens.count("*NUMBER*") / len(tokens)
In [14]:
def numResults(str):
    """Parse the hit count out of a Google results summary string.

    "No results" maps to 0; otherwise the "Cerca de" prefix is stripped and
    the first remaining token is parsed as an int (thousands commas removed).
    """
    if str == "No results":
        return 0
    first_token = str.replace("Cerca de", "").split()[0]
    return int(first_token.replace(",", ""))
In [15]:
# total number of quotation marks
def numComillasTot(str):
    """Count quotation-mark characters (guillemets, curly and straight quotes)."""
    return sum(str.count(q) for q in ("«", "»", "“", "”", "‘", "’", "'", "\""))

# fraction of quotation-mark characters
def numComillasRel(str):
    """Fraction of characters of ``str`` that are quotation marks.

    Returns 0.0 for an empty string (the original raised ZeroDivisionError).
    """
    if not str:
        return 0.0
    total = sum(str.count(q) for q in ("«", "»", "“", "”", "‘", "’", "'", "\""))
    return total / len(str)
In [16]:
# Feature engineering for the training split: binarise the label and derive
# the stylometric and external-signal columns used as model inputs.
# Label: 1 = "True" (real news), 0 = anything else (fake).
dataTrain['Category'] = dataTrain['Category'].apply(lambda x: 1 if x=="True" else 0)
# Character-level style features computed from the headline/body text.
dataTrain['%MayusculasHeadLine'] = dataTrain['Headline'].apply(lambda x: mayusculas(x))
dataTrain['#SignosInterrogación'] = dataTrain['Text'].apply(lambda x: numInterrogacionTot(x))
dataTrain['%SignosInterrogación'] = dataTrain['Text'].apply(lambda x: numInterrogacionRel(x))
dataTrain['#SignosExclamación'] = dataTrain['Text'].apply(lambda x: numExclamacionTot(x))
dataTrain['%SignosExclamación'] = dataTrain['Text'].apply(lambda x: numExclamacionRel(x))
# Sentiment-lexicon counts (absolute and relative to word count).
dataTrain['#PalabrasPositivas'] = dataTrain['Text'].apply(lambda x: positiveTot(x))
dataTrain['%PalabrasPositivas'] = dataTrain['Text'].apply(lambda x: positiveRel(x))
dataTrain['#PalabrasNegativas'] = dataTrain['Text'].apply(lambda x: negativeTot(x))
dataTrain['%PalabrasNegativas'] = dataTrain['Text'].apply(lambda x: negativeRel(x))
# Vocabulary richness and *NUMBER* placeholder / quote counts.
dataTrain['Palabras unicas/palabras totales'] = dataTrain['Text'].apply(lambda x: redundancia(x))
dataTrain['#Numeros'] = dataTrain['Text'].apply(lambda x: num(x))
dataTrain['%Numeros'] = dataTrain['Text'].apply(lambda x: numRel(x))
dataTrain['#Comillas'] = dataTrain['Text'].apply(lambda x: numComillasTot(x))
dataTrain['%Comillas'] = dataTrain['Text'].apply(lambda x: numComillasRel(x))
# External signals joined positionally: assumes googleResultsTrain and
# spanishCorrector_Train have the same row order as dataTrain — TODO confirm.
dataTrain['#ResultadosGoogle'] = googleResultsTrain['GSearch'].apply(lambda x: numResults(x))
dataTrain['#ResultadosGoogleNews'] = googleResultsTrain['GSearchNews'].apply(lambda x: numResults(x))
dataTrain['0ResultadosGoogleNews'] = dataTrain['#ResultadosGoogleNews'].apply(lambda x: 1 if x==0 else 0)
dataTrain['#Mistakes'] = spanishCorrector_Train['Inconsistency']+spanishCorrector_Train['Grammar']+spanishCorrector_Train['Typographical']+spanishCorrector_Train['Spacing']
In [17]:
# Same feature engineering as the training split, applied to the test split.
# Label: 1 = "True" (real news), 0 = anything else (fake).
dataTest['Category'] = dataTest['Category'].apply(lambda x: 1 if x=="True" else 0)
# Character-level style features computed from the headline/body text.
dataTest['%MayusculasHeadLine'] = dataTest['Headline'].apply(lambda x: mayusculas(x))
dataTest['#SignosInterrogación'] = dataTest['Text'].apply(lambda x: numInterrogacionTot(x))
dataTest['%SignosInterrogación'] = dataTest['Text'].apply(lambda x: numInterrogacionRel(x))
dataTest['#SignosExclamación'] = dataTest['Text'].apply(lambda x: numExclamacionTot(x))
dataTest['%SignosExclamación'] = dataTest['Text'].apply(lambda x: numExclamacionRel(x))
# Sentiment-lexicon counts (absolute and relative to word count).
dataTest['#PalabrasPositivas'] = dataTest['Text'].apply(lambda x: positiveTot(x))
dataTest['%PalabrasPositivas'] = dataTest['Text'].apply(lambda x: positiveRel(x))
dataTest['#PalabrasNegativas'] = dataTest['Text'].apply(lambda x: negativeTot(x))
dataTest['%PalabrasNegativas'] = dataTest['Text'].apply(lambda x: negativeRel(x))
# Vocabulary richness and *NUMBER* placeholder / quote counts.
dataTest['Palabras unicas/palabras totales'] = dataTest['Text'].apply(lambda x: redundancia(x))
dataTest['#Numeros'] = dataTest['Text'].apply(lambda x: num(x))
dataTest['%Numeros'] = dataTest['Text'].apply(lambda x: numRel(x))
dataTest['#Comillas'] = dataTest['Text'].apply(lambda x: numComillasTot(x))
dataTest['%Comillas'] = dataTest['Text'].apply(lambda x: numComillasRel(x))
# External signals joined positionally: assumes googleResultsTest and
# spanishCorrector_Test have the same row order as dataTest — TODO confirm.
dataTest['#ResultadosGoogle'] = googleResultsTest['GSearch'].apply(lambda x: numResults(x))
dataTest['#ResultadosGoogleNews'] = googleResultsTest['GSearchNews'].apply(lambda x: numResults(x))
dataTest['0ResultadosGoogleNews'] = dataTest['#ResultadosGoogleNews'].apply(lambda x: 1 if x==0 else 0)
dataTest['#Mistakes'] = spanishCorrector_Test['Inconsistency']+spanishCorrector_Test['Grammar']+spanishCorrector_Test['Typographical']+spanishCorrector_Test['Spacing']
In [18]:
# Spot-check one engineered row: every derived column should now be populated.
dataTrain.loc[3, :]
Out[18]:
Id                                                                                  4
Category                                                                            1
Topic                                                                       Education
Source                                                                   EL UNIVERSAL
Headline                            UNAM capacitará a maestros para aprobar prueba...
Text                                UNAM capacitará a maestros para aprobar prueba...
Link                                http://www.eluniversal.com.mx/articulo/nacion/...
%MayusculasHeadLine                                                         0.0980392
#ResultadosGoogle                                                               63500
#ResultadosGoogleNews                                                              10
0ResultadosGoogleNews                                                               0
#PalabrasPositivas                                                                 12
%PalabrasPositivas                                                          0.0718563
#PalabrasNegativas                                                                  8
%PalabrasNegativas                                                          0.0479042
Palabras unicas/palabras totales                                             0.568862
#Mistakes                                                                           1
#Numeros                                                                            0
%Numeros                                                                            0
#Comillas                                                                           0
%Comillas                                                                           0
#SignosInterrogación                                                                0
%SignosInterrogación                                                                0
%SignosExclamación                                                                  0
#SignosExclamación                                                                  0
Name: 3, dtype: object
In [19]:
# Exploratory profile of the engineered training frame.
# NOTE(review): pandas_profiling has been renamed to ydata-profiling upstream —
# confirm the pinned version before re-running this cell.
pandas_profiling.ProfileReport(dataTrain)



Out[19]:

División de los datos

In [20]:
# Target / feature split. The original assigned X_train via drop(columns=...)
# and then immediately overwrote it with a column selection (dead statement);
# the duplicated column list is also factored out into one constant.
FEATURE_COLS = ['%MayusculasHeadLine', '#SignosInterrogación', '%SignosInterrogación',
                '%SignosExclamación', '#SignosExclamación', '#ResultadosGoogle',
                '#ResultadosGoogleNews', '0ResultadosGoogleNews', '#PalabrasPositivas',
                '%PalabrasPositivas', '#PalabrasNegativas', '%PalabrasNegativas',
                'Palabras unicas/palabras totales', '#Numeros', '%Numeros',
                '#Comillas', '%Comillas', '#Mistakes']

Y_train = dataTrain['Category']
X_train = dataTrain.loc[:, FEATURE_COLS]
Y_test = dataTest['Category']
X_test = dataTest.loc[:, FEATURE_COLS]
In [21]:
# Sanity-check the dimensions of every split.
for split in (X_train, Y_train, X_test, Y_test):
    print(split.shape)
(676, 18)
(676,)
(295, 18)
(295,)

Grid Search basado en validación cruzada

In [22]:
# Hyperparameter grid to evaluate
# ==============================================================================
# NOTE(review): max_features values 20 and 22 exceed the 18 available
# predictors — confirm the installed sklearn version accepts these values.
param_grid = {'n_estimators': [40, 60, 100, 150],
              'max_features': [9, 12, 14, 16, 20, 22],
              'max_depth'   : [None, 3, 10, 20],
              'criterion'   : ['gini', 'entropy']
             }

# Grid search with repeated 5-fold cross-validation, optimising F1
# ==============================================================================
grid = GridSearchCV(
        estimator  = RandomForestClassifier(random_state = 123),
        param_grid = param_grid,
        scoring    = 'f1',
        n_jobs     = multiprocessing.cpu_count() - 1,
        cv         = RepeatedKFold(n_splits=5, n_repeats=3, random_state=123), 
        refit      = True,
        verbose    = 0,
        return_train_score = True
       )

grid.fit(X = X_train, y = Y_train)

# Results: top CV configurations ranked by mean test F1
# ==============================================================================
resultados = pd.DataFrame(grid.cv_results_)
# NOTE(review): the regex 'param*' means 'para' + zero-or-more 'm' and matches
# by substring; it happens to select the param_/params columns, but 'param.*'
# is the conventional spelling of the intent.
resultados.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False) \
    .head(4)
Out[22]:
param_criterion param_max_depth param_max_features param_n_estimators mean_test_score std_test_score mean_train_score std_train_score
150 entropy 10 12 100 0.787463 0.022419 0.997043 0.003312
151 entropy 10 12 150 0.787353 0.026929 0.997890 0.001926
147 entropy 10 9 150 0.785548 0.021822 0.996047 0.004436
175 entropy 20 12 150 0.785380 0.027346 1.000000 0.000000
In [23]:
# Report the best hyperparameter combination found by cross-validation.
separador = "----------------------------------------"
print(separador)
print("Mejores hiperparámetros encontrados (cv)")
print(separador)
print(grid.best_params_, ":", grid.best_score_, grid.scoring)
----------------------------------------
Mejores hiperparámetros encontrados (cv)
----------------------------------------
{'criterion': 'entropy', 'max_depth': 10, 'max_features': 12, 'n_estimators': 100} : 0.7874628753562462 f1
In [24]:
# Final model: the best estimator, refit on the full training set (refit=True).
modelo_final = grid.best_estimator_
In [25]:
# Test-set predictions of the final model
# ==============================================================================
predicciones = modelo_final.predict(X = X_test)
predicciones[:10]
Out[25]:
array([0, 1, 1, 1, 1, 1, 0, 1, 1, 1], dtype=int64)
In [26]:
# Confusion matrix and accuracy of the final model on the held-out test set.
mat_confusion = confusion_matrix(y_true=Y_test, y_pred=predicciones)
accuracy = accuracy_score(y_true=Y_test, y_pred=predicciones, normalize=True)

print("Matriz de confusión")
print("-------------------")
print(mat_confusion)
print("")
print(f"El accuracy de test es: {100 * accuracy} %")
Matriz de confusión
-------------------
[[104  38]
 [ 32 121]]

El accuracy de test es: 76.27118644067797 %
In [27]:
# Per-class precision/recall/F1 on the test set.
reporte = classification_report(y_true=Y_test, y_pred=predicciones)
print(reporte)
              precision    recall  f1-score   support

           0       0.76      0.73      0.75       142
           1       0.76      0.79      0.78       153

    accuracy                           0.76       295
   macro avg       0.76      0.76      0.76       295
weighted avg       0.76      0.76      0.76       295

In [28]:
# Impurity-based feature importances of the fitted random forest,
# sorted from most to least important.
importancia_predictores = pd.DataFrame({
    'predictor': X_train.columns,
    'importancia': modelo_final.feature_importances_,
})
print("Importancia de los predictores en el modelo")
print("-------------------------------------------")
importancia_predictores.sort_values('importancia', ascending=False)
Importancia de los predictores en el modelo
-------------------------------------------
Out[28]:
predictor importancia
12 Palabras unicas/palabras totales 0.208947
0 %MayusculasHeadLine 0.158633
13 #Numeros 0.094058
14 %Numeros 0.081388
6 #ResultadosGoogleNews 0.069416
5 #ResultadosGoogle 0.063832
16 %Comillas 0.062557
15 #Comillas 0.049743
11 %PalabrasNegativas 0.046647
9 %PalabrasPositivas 0.042315
10 #PalabrasNegativas 0.037244
8 #PalabrasPositivas 0.027383
17 #Mistakes 0.015298
7 0ResultadosGoogleNews 0.013404
2 %SignosInterrogación 0.010269
3 %SignosExclamación 0.008019
1 #SignosInterrogación 0.005858
4 #SignosExclamación 0.004987
In [29]:
# Persist the fitted model with joblib (pickle-based — load only from trusted files).
dump(modelo_final, 'model.pkl')
Out[29]:
['model.pkl']
In [ ]: